Introduction

This report provides an exploratory data analysis (EDA) of the processed Heart Disease Training dataset.


Load Dataset

# Load the processed training split. `stringsAsFactors = TRUE` makes
# read.csv() convert character columns to factors.
train <- read.csv(
  file = "../data/processed/train_scaled.csv",
  stringsAsFactors = TRUE
)

#' Print a quick textual overview of a data frame.
#'
#' Writes four sections to the console, in this order: dimensions and
#' structure, `summary()`, the result of `check_missing(df)`, and the result
#' of `class_balance(df)`.
#'
#' @param df Data frame to describe.
#' @param target_col Name of the target column.
#'   NOTE(review): this argument is not referenced in the body --
#'   `class_balance()` is called with `df` only. Confirm whether the helper
#'   should receive it.
#' @return Whatever the final `print()` call returns.
print_basic_info <- function(df, target_col) {
  # Section 1: shape and column types.
  cat("\nData overview\n")
  shape <- dim(df)
  print(shape)
  str(df)

  # Section 2: per-column summary statistics.
  cat("\n\nSummary:\n")
  column_summary <- summary(df)
  print(column_summary)

  # Section 3: missing values, as reported by the external helper.
  cat("\nMissing values:\n")
  missing_report <- check_missing(df)
  print(missing_report)

  # Section 4: class balance, as reported by the external helper.
  cat("\nTarget class balance:\n")
  balance_report <- class_balance(df)
  print(balance_report)
}

# Print the overview of the training data, naming HeartDisease as the target.
print_basic_info(df = train, target_col = "HeartDisease")
## 
## Data overview
## [1] 550  12
## 'data.frame':    550 obs. of  12 variables:
##  $ Age           : num  0.0757 0.6163 -1.7625 -0.8975 -1.33 ...
##  $ Sex           : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 1 ...
##  $ ChestPainType : Factor w/ 4 levels "ASY","ATA","NAP",..: 1 1 3 3 2 1 1 1 2 2 ...
##  $ RestingBP     : num  -0.14 -0.563 -0.14 -0.14 -0.405 ...
##  $ Cholesterol   : num  -1.8373 0.3152 -0.0451 0.3429 -0.1375 ...
##  $ FastingBS     : num  1.735 -0.575 -0.575 -0.575 -0.575 ...
##  $ RestingECG    : Factor w/ 3 levels "LVH","Normal",..: 2 2 2 2 2 1 3 2 2 2 ...
##  $ MaxHR         : num  -1.113 -0.83 0.503 0.261 1.715 ...
##  $ ExerciseAngina: Factor w/ 2 levels "N","Y": 2 2 1 1 1 2 2 1 1 1 ...
##  $ Oldpeak       : num  1.946 0.36 -0.853 -0.76 -0.853 ...
##  $ ST_Slope      : Factor w/ 3 levels "Down","Flat",..: 2 1 3 3 3 2 2 2 3 3 ...
##  $ HeartDisease  : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 2 2 2 1 1 ...
## 
## 
## Summary:
##       Age           Sex     ChestPainType   RestingBP        Cholesterol     
##  Min.   :-2.73561   F:126   ASY:289       Min.   :-7.0182   Min.   :-1.8373  
##  1st Qu.:-0.68120   M:424   ATA:112       1st Qu.:-0.6692   1st Qu.:-0.2021  
##  Median : 0.07569           NAP:122       Median :-0.1402   Median : 0.2043  
##  Mean   : 0.00000           TA : 27       Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.72445                         3rd Qu.: 0.3889   3rd Qu.: 0.6270  
##  Max.   : 2.56261                         Max.   : 3.5634   Max.   : 3.3730  
##    FastingBS        RestingECG      MaxHR          ExerciseAngina
##  Min.   :-0.5754   LVH   :113   Min.   :-3.13256   N:333         
##  1st Qu.:-0.5754   Normal:337   1st Qu.:-0.70872   Y:217         
##  Median :-0.5754   ST    :100   Median : 0.09923                 
##  Mean   : 0.0000                Mean   : 0.00000                 
##  3rd Qu.:-0.5754                3rd Qu.: 0.73549                 
##  Max.   : 1.7347                Max.   : 2.60387                 
##     Oldpeak        ST_Slope   HeartDisease
##  Min.   :-2.2531   Down: 37   No :248     
##  1st Qu.:-0.8533   Flat:282   Yes:302     
##  Median :-0.2467   Up  :231               
##  Mean   : 0.0000                          
##  3rd Qu.: 0.5465                          
##  Max.   : 4.9325                          
## 
## Missing values:
##            Age            Sex  ChestPainType      RestingBP    Cholesterol 
##              0              0              0              0              0 
##      FastingBS     RestingECG          MaxHR ExerciseAngina        Oldpeak 
##              0              0              0              0              0 
##       ST_Slope   HeartDisease 
##              0              0 
## 
## Target class balance:
## 
##  No Yes 
## 248 302
# Columns plotted as numeric features in the sections below.
numeric_cols <- c(
  "Age",
  "RestingBP",
  "Cholesterol",
  "FastingBS",
  "MaxHR",
  "Oldpeak"
)

# Columns plotted as categorical features in the sections below.
cat_cols <- c(
  "Sex",
  "ChestPainType",
  "RestingECG",
  "ExerciseAngina",
  "ST_Slope"
)

1. Distribution of Target Class

# One row per HeartDisease level: its count, its share of all rows, and a
# "<level> (<percent>)" label for the chart.
pie_data <- train %>%
  count(HeartDisease) %>%
  mutate(prop = n / sum(n)) %>%
  mutate(lbl = paste0(HeartDisease, " (", scales::percent(prop), ")"))

# A single stacked column drawn in polar coordinates renders as a pie chart.
ggplot(
  data = pie_data,
  mapping = aes(x = "", y = prop, fill = HeartDisease)
) +
  geom_col(color = "white", width = 1) +
  # Place each label at the midpoint of its slice.
  geom_text(
    mapping = aes(label = lbl),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  labs(title = "HeartDisease Class Distribution (Pie Chart)") +
  theme_void()

2. Distribution of Numeric Features

# Draw a 30-bin histogram for every column listed in `numeric_cols`.
# `.data[[col]]` looks the column up by its string name; it replaces
# `aes_string()`, which has been deprecated since ggplot2 3.0.0.
for (col in numeric_cols) {
  # print() is needed because plots are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = .data[[col]])) +
      geom_histogram(bins = 30, fill = "skyblue", color = "black") +
      theme_minimal() +
      # Set the x label explicitly so the axis shows the column name.
      labs(title = paste("Distribution of", col), x = col)
  )
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

3. Distribution of Categorical Features

# Draw a count bar chart for every column listed in `cat_cols`.
# `.data[[col]]` looks the column up by its string name; it replaces
# `aes_string()`, which has been deprecated since ggplot2 3.0.0.
for (col in cat_cols) {
  # print() is needed because plots are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = .data[[col]])) +
      geom_bar(fill = "orange", color = "black") +
      theme_minimal() +
      # Set the x label explicitly so the axis shows the column name.
      labs(title = paste("Distribution of", col), x = col)
  )
}

4. Numeric Features vs Heart Disease (Boxplots)

# Draw one boxplot per numeric column, split and filled by HeartDisease.
# The fixed column is referenced directly; the looped column is looked up
# with `.data[[col]]`, replacing the deprecated `aes_string()`.
for (col in numeric_cols) {
  # print() is needed because plots are not auto-printed inside a loop.
  print(
    ggplot(
      train,
      aes(x = HeartDisease, y = .data[[col]], fill = HeartDisease)
    ) +
      geom_boxplot() +
      theme_minimal() +
      # Set the y label explicitly so the axis shows the column name.
      labs(title = paste(col, "by HeartDisease"), y = col)
  )
}

5. Categorical Features vs Heart Disease (Proportion Plots)

# For every categorical column, draw bars filled by HeartDisease and
# normalised to height 1 (position = "fill"), so each bar shows proportions.
# `.data[[col]]` replaces the deprecated `aes_string()`.
for (col in cat_cols) {
  # print() is needed because plots are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = .data[[col]], fill = HeartDisease)) +
      geom_bar(position = "fill") +
      theme_minimal() +
      # Set the x label explicitly so the axis shows the column name.
      labs(
        title = paste(col, "vs HeartDisease (Proportion)"),
        x = col,
        y = "Proportion"
      )
  )
}

6. Correlation Matrix

# Pairwise correlations between the columns listed in `numeric_cols`.
numeric_data <- train[, numeric_cols]
cor_mat <- cor(x = numeric_data)

# Colour-coded upper triangle with the coefficients printed in black.
corrplot(
  corr = cor_mat,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.cex = 0.8
)

7. Scatter Plots (Age vs Key Numeric Features)

# Numeric columns plotted against Age below.
key_pairs <- c("RestingBP", "Cholesterol", "MaxHR", "Oldpeak")

# One scatter plot per column in `key_pairs`, with Age on the x axis and
# points coloured by HeartDisease. The looped column is looked up with
# `.data[[col]]`, replacing the deprecated `aes_string()`.
for (col in key_pairs) {
  # print() is needed because plots are not auto-printed inside a loop.
  print(
    ggplot(train, aes(x = Age, y = .data[[col]], color = HeartDisease)) +
      geom_point(alpha = 0.6, size = 2) +
      theme_minimal() +
      labs(
        title = paste("Scatter Plot:", "Age vs", col),
        x = "Age",
        y = col
      )
  )
}

8. Scatter Plot With Smooth Trend Line

# Age vs MaxHR, coloured by HeartDisease, with one LOESS trend line (and
# confidence band) per colour group.
ggplot(train, aes(x = Age, y = MaxHR, color = HeartDisease)) +
  geom_point(alpha = 0.5) +
  # Pass the formula explicitly: it is the one geom_smooth() would pick by
  # default, and naming it stops the "using formula = 'y ~ x'" message from
  # being emitted on every render.
  geom_smooth(method = "loess", formula = y ~ x, se = TRUE) +
  theme_minimal() +
  labs(title = "Age vs MaxHR with Trend Line")
## `geom_smooth()` using formula = 'y ~ x'

9. Faceted Scatter Plot (MaxHR vs Oldpeak by Sex)

# MaxHR vs Oldpeak, coloured by HeartDisease, with one panel per level of Sex.
ggplot(
  data = train,
  mapping = aes(x = MaxHR, y = Oldpeak, color = HeartDisease)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ Sex) +
  labs(title = "MaxHR vs Oldpeak (Faceted by Sex)") +
  theme_minimal()

10. Pairplot for Numeric Features

# Pair plot of the numeric columns plus the target, coloured by HeartDisease.
ggpairs(
  data = train[, c(numeric_cols, "HeartDisease")],
  mapping = aes(color = HeartDisease, alpha = 0.5)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.